In [1]:
!pip install beautifulsoup4
In [2]:
import re
import numpy as np
# urllib imports that work on both Python 3 and Python 2
try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
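With the imports in place, download the Hackpad page that lists the project links.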
In [3]:
f = urlopen("https://hackpad.com/ep/pad/static/PlmbAyVqqtQ")
the_html = f.read()
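Parse the downloaded HTML with BeautifulSoup, collect every anchor tag, and keep only the ones whose href points at GitHub.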
In [4]:
soup = BeautifulSoup(the_html, 'html.parser')
In [5]:
all_a = soup.find_all('a')
In [6]:
github_links = []
for a in all_a:
    # keep only anchors whose href points at GitHub
    href = a.attrs.get('href')
    if href is not None and 'github' in href:
        github_links.append(a)
In [7]:
#github_links
There are many duplicates and typos, so we need to clean the links up a bit.
In [8]:
clean_github_links = [str(a.attrs['href']).lower() for a in github_links]
clean_github_links = [link for link in clean_github_links if 'gist' not in link]
repo_links = []
repo_str = "https://github.com/{username}/{reponame}"
# e.g. "https://github.com/some-user/some-repo/blob/master/readme.md"
# yields ("some-user", "some-repo")
pattr = re.compile(r'github\.com/([0-9a-z\-]+)/([0-9a-z\-_]+).*')
for link in clean_github_links:
    try:
        username, repo = pattr.search(link).groups()
        repo_links.append(repo_str.format(username=username, reponame=repo))
    except AttributeError:
        # the link did not match the username/repo pattern
        pass
repo_links = set(repo_links)
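Some of the cleaned URLs may still contain typos or point to repositories that no longer exist, so check that each link actually resolves before keeping it.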
In [9]:
def is_valid_url(url):
    # Return True if the URL can be opened successfully.
    try:
        urlopen(Request(url))
        return True
    except Exception:
        # the URL wasn't valid (or the request failed)
        return False
In [10]:
valid_repo_links = []
for link in repo_links:
    if is_valid_url(link):
        valid_repo_links.append(link)
In [11]:
for link in valid_repo_links:
    print(link)
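Finally, save the validated repository URLs to a text file, one per line.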
In [12]:
np.savetxt("github_links.txt", valid_repo_links, fmt='%s')